slice <- readRDS("Pr_slice_decontX_filtered_annotated_final1.rds")
Warning: cannot open compressed file 'Pr_slice_decontX_filtered_annotated_final1.rds', probable reason 'No such file or directory'Error in gzfile(file, "rb") : cannot open the connection
whole <- readRDS("/Users/nityagupta/Desktop/GM project/best_objects/PrLN_whole_qc_annotated_final.rds")
slice <- readRDS("/Users/nityagupta/Desktop/GM project/best_objects/Pr_slice_decontX0.9_final_newUMAP.rds")
dittoDimPlot(whole, "subcluster_name", do.label = TRUE)
dittoDimPlot(slice, "subcluster_name", do.label = TRUE)
Comparing similar clusters between whole and slices
whole_clusters <- unique(whole$subcluster_name)
whole_result <- data.frame(row.names = rownames(whole))
for (subcluster_name in whole_clusters) {
subset <- whole[, whole$subcluster_name == subcluster_name]
counts <- as.matrix(subset@assays@data@listData[["logcounts"]])
subset_result <- as.data.frame(apply(counts, 1, median))
colnames(subset_result) <- subcluster_name
whole_result <- cbind(whole_result, subset_result)
}
slice_clusters <- unique(slice$subcluster_name)
slice_result <- data.frame(row.names = rownames(slice))
for (subcluster_name in slice_clusters) {
subset <- slice[, slice$subcluster_name == subcluster_name]
counts <- as.matrix(subset@assays@data@listData[["logcounts"]])
subset_result <- as.data.frame(apply(counts, 1, median))
colnames(subset_result) <- subcluster_name
slice_result <- cbind(slice_result, subset_result)
}
Warning: sparse->dense coercion: allocating vector of size 1.5 GiB
slice_colnames <- paste0("s_", colnames(slice_result))
colnames(slice_result) <- slice_colnames
whole_colnames <- paste0("w_", colnames(whole_result))
colnames(whole_result) <- whole_colnames
combined_result <- merge(slice_result, whole_result, by=0)
rownames(combined_result) <- combined_result$Row.names
combined_result$Row.names <- NULL
combined_result$s_gene <- NULL
allfeatures <- modelGeneVar(slice)
allfeatures[order(allfeatures$bio, decreasing=TRUE),]
DataFrame with 21761 rows and 6 columns
mean total tech bio p.value FDR
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
CFD 1.94998 5.82566 1.34184 4.48382 3.23325e-121 4.44120e-118
C7 2.13428 4.82406 1.46039 3.36366 9.69493e-59 5.70727e-56
COX2 1.40549 4.36895 1.06228 3.30667 1.72608e-105 1.87180e-102
BMT2 2.75746 3.99958 1.78233 2.21725 1.58857e-18 2.33792e-16
APOE 1.54751 3.32653 1.13495 2.19159 6.62930e-42 2.73180e-39
... ... ... ... ... ... ...
HNRNPA2B1 2.01947 0.953709 1.38656 -0.432852 0.985535 1.000000
HMGB1 2.42005 1.161545 1.62529 -0.463743 0.977065 1.000000
ENSMMUG00000041076 2.90478 1.361042 1.83527 -0.474232 0.964707 0.993394
RPS28 3.77411 1.744062 2.24789 -0.503824 0.941601 0.978906
ENSMMUG00000012140 3.17943 1.165070 1.93062 -0.765553 0.997237 1.000000
topn2000_features_slice <- getTopHVGs(allfeatures, n=2000)
allfeatures <- modelGeneVar(whole)
allfeatures[order(allfeatures$bio, decreasing=TRUE),]
DataFrame with 21761 rows and 6 columns
mean total tech bio p.value FDR
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
COX2 3.20818 7.38327 1.80138 5.58189 3.17251e-69 8.07721e-66
ENSMMUG00000015202 3.05643 7.01930 1.75967 5.25964 1.46456e-64 2.98302e-61
COX3 3.45263 6.20982 1.87262 4.33720 1.34914e-39 9.15979e-37
HSPB1 2.22329 5.64927 1.55555 4.09372 1.59917e-50 1.71432e-47
BMT2 2.06413 5.58766 1.51989 4.06777 3.50835e-52 4.20342e-49
... ... ... ... ... ... ...
PABPC1 2.32614 1.12469 1.58166 -0.456978 0.949084 0.985618
ENSMMUG00000001645 2.25431 1.10171 1.56382 -0.462112 0.952865 0.988487
MAMU-A 2.20016 1.08177 1.55064 -0.468871 0.956570 0.990464
SON 1.96683 1.02268 1.49797 -0.475295 0.963807 0.995291
HNRNPA2B1 2.55677 1.02860 1.64662 -0.618018 0.983219 1.000000
topn2000_features_whole <- getTopHVGs(allfeatures, n=2000)
union2000 <- c(topn2000_features_slice, topn2000_features_whole)
intersect2000 <- intersect(topn2000_features_slice, topn2000_features_whole)
length(intersect2000)
[1] 1245
combined_intersect <- combined_result
combined_intersect <- combined_intersect[rownames(combined_intersect) %in% intersect2000,]
combined_union <- combined_result
combined_union <- combined_union[rownames(combined_union) %in% intersect2000,]
corr_mat <- round(cor(combined_union),5)
corr_mat <- corr_mat[, colnames(slice_result)]
corr_mat <- corr_mat[colnames(whole_result), ]
heatmap_data <- data.frame(clusters <- rownames(corr_mat))
heatmap_data$cell_group <- ifelse(heatmap_data$clusters %in% c("w_PECAM1high_CAV1high_ESAMhigh_LEC", "w_PECAM1high_NT5E+_LEC", "w_PECAM1low_CCL20high_LEC", "s_PECAM1low_EFNB2high_LEC", "s_PECAM1high_CAV1high_LEC", "s_LYVE1+_LEC"), "Endothelial",
ifelse(heatmap_data$clusters %in% c("s_PI16+_Fibroblasts", "s_CCL19high_Fibroblasts", "s_PTPRGhigh_Fibroblasts", "s_CXCL12high_Fibroblasts", "s_CXCL12high_HDAC11+_Fibroblasts", "w_PTPRGhigh_Fibroblasts", "w_CXCL12high_Fibroblasts", "w_CCL19high_Fibroblasts", "w_PI16+_Fibroblasts"), "Fibroblasts",
ifelse(heatmap_data$clusters %in% c("s_T_cells", "s_B_cells", "s_BCL6high_cycling_B_cells", "s_Cycling_B_cells", "s_Plasma_cells", "w_ribo_high_T_cells", "w_T_regulatory_cells", "w_CD8_T_cells", "w_S100A6high_T_cells", "w_NK_cells", "w_Cycling_T_cells", "w_B_cells", "w_mito_high_T_and_B_cells", "w_Plasma_cells"), "Lymphoid",
ifelse(heatmap_data$clusters %in% c("w_Myofibroblasts", "w_Pericytes", "s_ITGA1high_ZEB2high_Pericytes", "s_Pericytes", "s_HDAC11+_cycling_Pericytes", "s_Myofibroblasts", "s_PEAR1+_Pericytes"), "Pericytes",
ifelse(heatmap_data$clusters %in% c("s_LAMP3+_CCL17high_DCs", "CLEC9Ahigh_DCs", "s_XCR1+_CLEC9Ahigh_DCs", "s_Neutrophils", "s_Mast_cells", "s_CD14+_Monocytes", "s_LAMP3+_DCs", "s_pDCs", "w_pDCs", "w_CD1C+_DCs", "w_XCR1+_DCs", "w_Mast_cells", "w_LAMP3+_DCs", "w_Neutrophils", "w_mito_high_myeloid", "w_CD14+_monocytes", "w_CD16+_monocytes"), "Myeloid",
"-")))))
heatmap_data1 <- data.frame(clusters <- colnames(corr_mat))
colnames(heatmap_data1) <- c("clusters")
heatmap_data1$cell_group <- ifelse(heatmap_data$clusters %in% c("w_PECAM1high_CAV1high_ESAMhigh_LEC", "w_PECAM1high_NT5E+_LEC", "w_PECAM1low_CCL20high_LEC", "s_PECAM1low_EFNB2high_LEC", "s_PECAM1high_CAV1high_LEC", "s_LYVE1+_LEC"), "Endothelial",
ifelse(heatmap_data$clusters %in% c("s_PI16+_Fibroblasts", "s_CCL19high_Fibroblasts", "s_PTPRGhigh_Fibroblasts", "s_CXCL12high_Fibroblasts", "s_CXCL12high_HDAC11+_Fibroblasts", "w_PTPRGhigh_Fibroblasts", "w_CXCL12high_Fibroblasts", "w_CCL19high_Fibroblasts", "w_PI16+_Fibroblasts"), "Fibroblasts",
ifelse(heatmap_data$clusters %in% c("s_T_cells", "s_B_cells", "s_BCL6high_cycling_B_cells", "s_Cycling_B_cells", "s_Plasma_cells", "w_ribo_high_T_cells", "w_T_regulatory_cells", "w_CD8_T_cells", "w_S100A6high_T_cells", "w_NK_cells", "w_Cycling_T_cells", "w_B_cells", "w_mito_high_T_and_B_cells", "w_Plasma_cells"), "Lymphoid",
ifelse(heatmap_data$clusters %in% c("w_Myofibroblasts", "w_Pericytes", "s_ITGA1high_ZEB2high_Pericytes", "s_Pericytes", "s_HDAC11+_cycling_Pericytes", "s_Myofibroblasts", "s_PEAR1+_Pericytes"), "Pericytes",
ifelse(heatmap_data$clusters %in% c("s_LAMP3+_CCL17high_DCs", "CLEC9Ahigh_DCs", "s_XCR1+_CLEC9Ahigh_DCs", "s_Neutrophils", "s_Mast_cells", "s_CD14+_Monocytes", "s_LAMP3+_DCs", "s_pDCs", "w_pDCs", "w_CD1C+_DCs", "w_XCR1+_DCs", "w_Mast_cells", "w_LAMP3+_DCs", "w_Neutrophils", "w_mito_high_myeloid", "w_CD14+_monocytes", "w_CD16+_monocytes"), "Myeloid",
"-")))))
Error in `$<-.data.frame`(`*tmp*`, cell_group, value = c("Lymphoid", "Lymphoid", :
replacement has 27 rows, data has 26

dev.off()
Error in dev.off() : cannot shut down device 1 (the null device)

SingleR - using the whole dataset as a referance to name the
slices
slice1 <- SingleR(test = slice, ref = whole, labels = whole$subcluster_name, de.method="wilcox")
png("/Users/nityagupta/Desktop/GM project/code/singleR_heatmap.png",width=14,height=7,units="in",res=1200)
plotScoreHeatmap(slice1)
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
dev.off()
null device
1

plotDeltaDistribution(slice1, ncol = 4, size = 1)
ggsave("singleR_delta_distribution.png", dpi = 300, width = 9)

slice$pruned.labels <- slice1$pruned.labels
dittoDimPlot(slice, "subcluster_name", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
dittoDimPlot(slice, "labels", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
dittoDimPlot(slice, "pruned.labels", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
library(RColorBrewer)
n <- 60
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))


slice_df2 <- slice_df1
slice_df2$Frequency <- 10
ggplot(slice_df2,
aes(axis1 = subcluster_name,
axis2 = labels,
y = Frequency)) +
geom_alluvium(aes(fill = match))+
geom_stratum() +
geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
expand = c(.1, .1)) +
labs(y = "Frequency") +
theme(axis.text.y = element_text(size = 50))+
theme_minimal()
Alluvial plot for fibroblasts

Alluvial plot for endothelial



diff_abundance_w <- as.data.frame(table(whole_da$sample_id, whole_da$subcluster_name))
diff_abundance_s <- as.data.frame(table(slice_da$sample_id, slice_da$subcluster_name))
diff_abundance <- rbind(diff_abundance_w, diff_abundance_s)
colnames(diff_abundance) <- c("sample_id", "cell_type", "number_of_cells")
diff_abundance <- separate(data = diff_abundance, col = "sample_id", c('donor_id', 'sample_type'), sep = "_", remove = FALSE)
diff_abundance$donor_id <- as.factor(diff_abundance$donor_id)
diff_abundance$sample_id <- as.factor(diff_abundance$sample_id)
diff_abundance$sample_type <- as.factor(diff_abundance$sample_type)
diff_abundance
Differential Abundance
whole_da <- whole[,whole$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
slice_da <- slice[,slice$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
whole_da <- colData(whole_da)
slice_da <- colData(slice_da)
whole_da <- whole_da[whole_da$donor_id %in% c("PrLN4", "PrLN5", "PrLN6"), ]
slice_da <- slice_da[slice_da$sample_id %in% c("PrLN4_sCTRL", "PrLN5_sCTRL", "PrLN6_sCTRL"),]
whole_da$sample_id <- droplevels(whole_da$sample_id)
whole_da$sample_type <- droplevels(whole_da$sample_type)
whole_da$donor_id <- droplevels(whole_da$donor_id)
slice_da$sample_id <- droplevels(slice_da$sample_id)
slice_da$sample_type <- droplevels(slice_da$sample_type)
table(whole_da$donor_id)
PrLN4 PrLN5 PrLN6
6142 1766 7516
table(slice_da$sample_id)
PrLN4_sCTRL PrLN5_sCTRL PrLN6_sCTRL
3736 4155 3240
table(whole_da$sample_type)
wCTRL
15424
table(whole_da$sample_id)
PrLN4_wCTRL PrLN5_wCTRL PrLN6_wCTRL
6142 1766 7516
table(slice_da$sample_type)
sCTRL
11131
diff_abundance_w <- as.data.frame(table(whole_da$sample_id, whole_da$subcluster_name))
diff_abundance_s <- as.data.frame(table(slice_da$sample_id, slice_da$subcluster_name))
diff_abundance <- rbind(diff_abundance_w, diff_abundance_s)
colnames(diff_abundance) <- c("sample_id", "cell_type", "number_of_cells")
diff_abundance <- separate(data = diff_abundance, col = "sample_id", c('donor_id', 'sample_type'), sep = "_", remove = FALSE)
diff_abundance$donor_id <- as.factor(diff_abundance$donor_id)
diff_abundance$sample_id <- as.factor(diff_abundance$sample_id)
diff_abundance$sample_type <- as.factor(diff_abundance$sample_type)
diff_abundance
to calculate the percentage we divide by the total number of cells in
the sample
cells_per_sample <- rbind(data.frame(table(slice_da$sample_id)), data.frame(table(whole_da$sample_id)))
colnames(cells_per_sample) <- c("sample_id", "cells_per_sample")
cells_per_sample
matching_rows <- match(diff_abundance$sample_id, cells_per_sample$sample_id)
diff_abundance$cells_per_sample <- cells_per_sample$cells_per_sample[matching_rows]
diff_abundance
diff_abundance$percentage_cells <- diff_abundance$number_of_cells/diff_abundance$cells_per_sample
diff_abundance
diff_abundance <- diff_abundance %>%
mutate(category = recode(cell_type, "CD14+_Monocytes" = "CD14+_monocytes"))
diff_abundance$cell_type[diff_abundance$cell_type == "CD14+_Monocytes"] <- "CD14+_monocytes"
ggplot(diff_abundance, aes(x = factor(cell_type, level = c("CD14+_monocytes", "CD16+_monocytes", "Neutrophils", "Mast_cells", "CD1C+_DCs", "CLEC9Ahigh_DCs", "XCR1+_CLEC9Ahigh_DCs", "XCR1+_DCs", "LAMP3+_DCs", "LAMP3+_CCL17high_DCs", "pDCs", "mito_high_myeloid", "PECAM1high_CAV1high_ESAMhigh_LEC", "PECAM1high_NT5E+_LEC", "PECAM1low_CCL20high_LEC", "PECAM1high_CAV1high_LEC", "PECAM1low_EFNB2high_LEC","LYVE1+_LEC", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts", "CXCL12high_HDAC11+_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "ITGA1high_ZEB2high_Pericytes","HDAC11+_cycling_Pericytes", "PEAR1+_Pericytes", "Myofibroblasts")), y = percentage_cells, fill = sample_type)) +
geom_boxplot() +
labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Treatment") +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1)) +
scale_fill_manual(values=c("#E78AC3", "#A1D99B"))

ggplot(diff_abundance, aes(x = factor(cell_type, level = c("CD14+_monocytes", "CD16+_monocytes", "Neutrophils", "Mast_cells", "CD1C+_DCs", "CLEC9Ahigh_DCs", "XCR1+_CLEC9Ahigh_DCs", "XCR1+_DCs", "LAMP3+_DCs", "LAMP3+_CCL17high_DCs", "pDCs", "mito_high_myeloid", "PECAM1high_CAV1high_ESAMhigh_LEC", "PECAM1high_NT5E+_LEC", "PECAM1low_CCL20high_LEC", "PECAM1high_CAV1high_LEC", "PECAM1low_EFNB2high_LEC","LYVE1+_LEC", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts", "CXCL12high_HDAC11+_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "ITGA1high_ZEB2high_Pericytes","HDAC11+_cycling_Pericytes", "PEAR1+_Pericytes", "Myofibroblasts")), y = percentage_cells, fill = sample_type)) +
geom_boxplot() +
labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Sample Type", col = "Sample ID") +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1),panel.background = element_rect(fill = "white", colour ="darkgray", size = 2), panel.grid.major = element_line(colour = "gray", size = 0.3))+
scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
geom_point(aes(col = sample_id), size = 2)
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
Please use the `linewidth` argument instead.Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
Please use the `linewidth` argument instead.

NA
diff_abundance1 = diff_abundance[diff_abundance$cell_type %in% c("CD14+_monocytes", "Neutrophils", "Mast_cells", "LAMP3+_DCs", "pDCs", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "Myofibroblasts"),]
cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance1)
temp <- diff_abundance1 %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp
cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance1)
temp1 <- diff_abundance1 %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp1
pwc <- diff_abundance1 %>%
group_by(cell_type) %>%
emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni")
pwc
pwc1 <- subset(pwc, select = c(cell_type, statistic, p))
pwc1 <- pwc1[order(pwc1$p),]
colnames(pwc1) <- c("cell_type", "statistic", "p_value")
pwc1
diff_abundance_w <- as.data.frame(table(whole_da$sample_id, whole_da$cluster_name))
diff_abundance_s <- as.data.frame(table(slice_da$sample_id, slice_da$cluster_name))
diff_abundance <- rbind(diff_abundance_w, diff_abundance_s)
colnames(diff_abundance) <- c("sample_id", "cell_type", "number_of_cells")
diff_abundance <- separate(data = diff_abundance, col = "sample_id", c('donor_id', 'sample_type'), sep = "_", remove = FALSE)
diff_abundance$donor_id <- as.factor(diff_abundance$donor_id)
diff_abundance$sample_id <- as.factor(diff_abundance$sample_id)
diff_abundance$sample_type <- as.factor(diff_abundance$sample_type)
diff_abundance
to calculate the percentage we divide by the total number of cells in
the sample
cells_per_sample <- rbind(data.frame(table(slice_da$sample_id)), data.frame(table(whole_da$sample_id)))
colnames(cells_per_sample) <- c("sample_id", "cells_per_sample")
cells_per_sample
matching_rows <- match(diff_abundance$sample_id, cells_per_sample$sample_id)
diff_abundance$cells_per_sample <- cells_per_sample$cells_per_sample[matching_rows]
diff_abundance
diff_abundance$percentage_cells <- diff_abundance$number_of_cells/diff_abundance$cells_per_sample
diff_abundance
NA
cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance)
temp <- diff_abundance %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp
cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance)
temp1 <- diff_abundance %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp1
pwc <- diff_abundance %>%
group_by(cell_type) %>%
emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni")
pwc
NA
NA
ggplot(diff_abundance, aes(x = cell_type, y = percentage_cells, fill = sample_type)) +
geom_boxplot() +
labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Treatment", col = "Sample ID") +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1)) +
scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
geom_point(aes(col = sample_id), size = 2)

ggplot(diff_abundance, aes(x = cell_type, y = percentage_cells, fill = sample_type)) +
geom_boxplot() +
labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Sample Type", col = "Sample ID") +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1),panel.background = element_rect(fill = "white", colour ="darkgray", size = 2), panel.grid.major = element_line(colour = "gray", size = 0.3))+
scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
geom_point(aes(col = sample_id), size = 2)

#+geom_signif(stat = "identity", data = data.frame(x = c(0.12, 0.37), xend = c(0.17, ), y = c(0.4, 0.39), annotation = c("*", "**")), aes(x = x, xend = xend, y = y, yend = y, annotation = annotation))
cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance)
temp <- diff_abundance %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp
cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance)
temp1 <- diff_abundance %>%
group_by(cell_type) %>%
anova_test(percentage_cells ~ sample_type, error = cd19.model)
temp1
pwc <- diff_abundance %>%
group_by(cell_type) %>%
emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni")
pwc
pwc1 <- subset(pwc, select = c(cell_type, statistic, p))
pwc1 <- pwc1[order(pwc1$p),]
colnames(pwc1) <- c("cell_type", "statistic", "p_value")
pwc1
whole_da <- whole[,whole$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
slice_da <- slice[,slice$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
whole_da <- whole_da[whole_da$donor_id %in% c("PrLN4", "PrLN5", "PrLN6"), ]
slice_da <- slice_da[slice_da$sample_id %in% c("PrLN4_sCTRL", "PrLN5_sCTRL", "PrLN6_sCTRL"),]
Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE) :
logical subscript too long (25157, should be 21761)
---
title: "Whole Slice Comparisons"
output: html_notebook
---

```{r}
whole <- readRDS("Pr_whole_filtered_annotated_final.rds")
slice <- readRDS("Pr_slice_decontX_filtered_annotated_final1.rds")
```


```{r}
whole <- readRDS("/Users/nityagupta/Desktop/GM project/best_objects/PrLN_whole_qc_annotated_final.rds")
slice <- readRDS("/Users/nityagupta/Desktop/GM project/best_objects/Pr_slice_decontX0.9_final_newUMAP.rds")
```



```{r, fig.width=18}
dittoDimPlot(whole, "subcluster_name", do.label = TRUE)
dittoDimPlot(slice, "subcluster_name", do.label = TRUE)
```

### Comparing similar clusters between whole and slices

```{r}
whole_clusters <- unique(whole$subcluster_name)
whole_result <- data.frame(row.names = rownames(whole))


for (subcluster_name in whole_clusters) {
  subset <- whole[, whole$subcluster_name == subcluster_name]
  counts <- as.matrix(subset@assays@data@listData[["logcounts"]])
  subset_result <- as.data.frame(apply(counts, 1, median))
  colnames(subset_result) <- subcluster_name
  whole_result <- cbind(whole_result, subset_result)
}


```

```{r}
slice_clusters <- unique(slice$subcluster_name)
slice_result <- data.frame(row.names = rownames(slice))


for (subcluster_name in slice_clusters) {
  subset <- slice[, slice$subcluster_name == subcluster_name]
  counts <- as.matrix(subset@assays@data@listData[["logcounts"]])
  subset_result <- as.data.frame(apply(counts, 1, median))
  colnames(subset_result) <- subcluster_name
  slice_result <- cbind(slice_result, subset_result)
}


```

```{r}
slice_colnames <- paste0("s_", colnames(slice_result))
colnames(slice_result) <- slice_colnames

whole_colnames <- paste0("w_", colnames(whole_result))
colnames(whole_result) <- whole_colnames

```

```{r}
combined_result <- merge(slice_result, whole_result, by=0)
rownames(combined_result) <- combined_result$Row.names
combined_result$Row.names <- NULL
combined_result$s_gene <- NULL
```

```{r}
allfeatures <- modelGeneVar(slice)
allfeatures[order(allfeatures$bio, decreasing=TRUE),] 
topn2000_features_slice <- getTopHVGs(allfeatures, n=2000) 

allfeatures <- modelGeneVar(whole)
allfeatures[order(allfeatures$bio, decreasing=TRUE),] 
topn2000_features_whole <- getTopHVGs(allfeatures, n=2000) 


union2000 <- c(topn2000_features_slice, topn2000_features_whole)
intersect2000 <- intersect(topn2000_features_slice, topn2000_features_whole)
length(intersect2000)
```

```{r}
combined_intersect <- combined_result
combined_intersect <- combined_intersect[rownames(combined_intersect) %in% intersect2000,]
```

```{r}
combined_union <- combined_result
combined_union <- combined_union[rownames(combined_union) %in% intersect2000,]
```

```{r}
corr_mat <- round(cor(combined_union),5)

corr_mat <- corr_mat[, colnames(slice_result)]
corr_mat <- corr_mat[colnames(whole_result), ]
```

```{r}
heatmap_data <- data.frame(clusters <- rownames(corr_mat))
heatmap_data$cell_group <- ifelse(heatmap_data$clusters %in% c("w_PECAM1high_CAV1high_ESAMhigh_LEC", "w_PECAM1high_NT5E+_LEC", "w_PECAM1low_CCL20high_LEC", "s_PECAM1low_EFNB2high_LEC", "s_PECAM1high_CAV1high_LEC", "s_LYVE1+_LEC"), "Endothelial",
                                  ifelse(heatmap_data$clusters %in% c("s_PI16+_Fibroblasts", "s_CCL19high_Fibroblasts", "s_PTPRGhigh_Fibroblasts", "s_CXCL12high_Fibroblasts", "s_CXCL12high_HDAC11+_Fibroblasts", "w_PTPRGhigh_Fibroblasts", "w_CXCL12high_Fibroblasts", "w_CCL19high_Fibroblasts", "w_PI16+_Fibroblasts"), "Fibroblasts", 
                                         ifelse(heatmap_data$clusters %in% c("s_T_cells", "s_B_cells", "s_BCL6high_cycling_B_cells", "s_Cycling_B_cells", "s_Plasma_cells", "w_ribo_high_T_cells", "w_T_regulatory_cells", "w_CD8_T_cells", "w_S100A6high_T_cells", "w_NK_cells", "w_Cycling_T_cells", "w_B_cells", "w_mito_high_T_and_B_cells", "w_Plasma_cells"), "Lymphoid", 
                                                ifelse(heatmap_data$clusters %in% c("w_Myofibroblasts", "w_Pericytes", "s_ITGA1high_ZEB2high_Pericytes", "s_Pericytes", "s_HDAC11+_cycling_Pericytes", "s_Myofibroblasts", "s_PEAR1+_Pericytes"), "Pericytes",
                                                       ifelse(heatmap_data$clusters %in% c("s_LAMP3+_CCL17high_DCs", "CLEC9Ahigh_DCs", "s_XCR1+_CLEC9Ahigh_DCs", "s_Neutrophils", "s_Mast_cells", "s_CD14+_Monocytes", "s_LAMP3+_DCs", "s_pDCs", "w_pDCs", "w_CD1C+_DCs", "w_XCR1+_DCs", "w_Mast_cells", "w_LAMP3+_DCs", "w_Neutrophils", "w_mito_high_myeloid", "w_CD14+_monocytes", "w_CD16+_monocytes"), "Myeloid",
                                                 "-")))))


       
heatmap_data1 <- data.frame(clusters <- colnames(corr_mat))
colnames(heatmap_data1) <- c("clusters")
heatmap_data1$cell_group <- ifelse(heatmap_data1$clusters %in% c("w_PECAM1high_CAV1high_ESAMhigh_LEC", "w_PECAM1high_NT5E+_LEC", "w_PECAM1low_CCL20high_LEC", "s_PECAM1low_EFNB2high_LEC", "s_PECAM1high_CAV1high_LEC", "s_LYVE1+_LEC"), "Endothelial",
                                  ifelse(heatmap_data1$clusters %in% c("s_PI16+_Fibroblasts", "s_CCL19high_Fibroblasts", "s_PTPRGhigh_Fibroblasts", "s_CXCL12high_Fibroblasts", "s_CXCL12high_HDAC11+_Fibroblasts", "w_PTPRGhigh_Fibroblasts", "w_CXCL12high_Fibroblasts", "w_CCL19high_Fibroblasts", "w_PI16+_Fibroblasts"), "Fibroblasts", 
                                         ifelse(heatmap_data1$clusters %in% c("s_T_cells", "s_B_cells", "s_BCL6high_cycling_B_cells", "s_Cycling_B_cells", "s_Plasma_cells", "w_ribo_high_T_cells", "w_T_regulatory_cells", "w_CD8_T_cells", "w_S100A6high_T_cells", "w_NK_cells", "w_Cycling_T_cells", "w_B_cells", "w_mito_high_T_and_B_cells", "w_Plasma_cells"), "Lymphoid", 
                                                ifelse(heatmap_data1$clusters %in% c("w_Myofibroblasts", "w_Pericytes", "s_ITGA1high_ZEB2high_Pericytes", "s_Pericytes", "s_HDAC11+_cycling_Pericytes", "s_Myofibroblasts", "s_PEAR1+_Pericytes"), "Pericytes",
                                                       ifelse(heatmap_data1$clusters %in% c("s_LAMP3+_CCL17high_DCs", "s_CLEC9Ahigh_DCs", "s_XCR1+_CLEC9Ahigh_DCs", "s_Neutrophils", "s_Mast_cells", "s_CD14+_Monocytes", "s_LAMP3+_DCs", "s_pDCs", "w_pDCs", "w_CD1C+_DCs", "w_XCR1+_DCs", "w_Mast_cells", "w_LAMP3+_DCs", "w_Neutrophils", "w_mito_high_myeloid", "w_CD14+_monocytes", "w_CD16+_monocytes"), "Myeloid",
                                                 "-")))))



```

```{r, fig.width=15}
Heatmap(corr_mat, row_split = heatmap_data$cell_group, column_split = heatmap_data1$cell_group, show_column_dend = FALSE, show_row_dend = FALSE, row_title_rot = 0, top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")))), left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")))))

```

```{r, fig.width=15}

png("/Users/nityagupta/Desktop/GM project/code/whole_slice_coorelation_heatmap.png",width=11,height=9,units="in",res=1200)

Heatmap(corr_mat, row_split = heatmap_data$cell_group, column_split = heatmap_data1$cell_group, show_column_dend = FALSE, show_row_dend = FALSE, row_title_rot = 0, bottom_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = c("#FD6467" , "#FFD92F" ,"#E5C494", "#A6D854", "#46ACC8")), labels = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid")), show_legend = c(TRUE)), right_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = c("#FD6467" , "#FFD92F" ,"#E5C494", "#A6D854", "#46ACC8")), labels = c("Fibroblasts", "Peri", "LECs", "Lymphoid", "Myeloid"))), column_title = NULL, row_title = NULL, name = "Correlation score") + Legend(at = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid"), title = "Cell Type", legend_gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")))

dev.off()

png("/Users/nityagupta/Desktop/GM project/code/whole_slice_coorelation_heatmap_legend.png",width=11,height=9,units="in",res=1200)
l = Legend(at = c("Fibroblasts", "Pericytes", "Endothelial cells (LECs)", "Lymphoid cells", "Myeloid cells"), title = "Cell Type", legend_gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")))
draw(l)
dev.off()


```





```{r}
ha = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")), labels = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid")), annotation_legend_param = list(title = "Cell Type", at = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid"), labels = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid")), show_legend = TRUE)

Heatmap(corr_mat, row_split = heatmap_data$cell_group, column_split = heatmap_data1$cell_group, show_column_dend = FALSE, show_row_dend = FALSE, row_title_rot = 0, bottom_annotation = ha, right_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = c("#FD6467" , "#FFD92F" ,"#E5C494", "#A6D854", "#46ACC8")), labels = c("Fibroblasts", "Peri", "LECs", "Lymphoid", "Myeloid"))), column_title = NULL, row_title = NULL, name = "Correlation score") + Legend(at = c("Fibroblasts", "Pericytes", "LECs", "Lymphoid", "Myeloid"), title = "Cell Type", legend_gp = gpar(fill = c("#46ACC8" , "#FFD92F" ,"#E5C494", "#A6D854", "#FD6467")))
```


### SingleR - using the whole dataset as a referance to name the slices

```{r}
slice1 <- SingleR(test = slice, ref = whole, labels = whole$subcluster_name, de.method="wilcox")
```

```{r, fig.width=20}
png("/Users/nityagupta/Desktop/GM project/code/singleR_heatmap.png",width=14,height=7,units="in",res=1200)
plotScoreHeatmap(slice1)
dev.off()
```

```{r, fig.height=25}
celltype_mean <- aggregateAcrossCells(as(slice, "SingleCellExperiment"),  
                     ids = slice$labels, 
                     statistics = "mean",
                     use.assay.type = "logcounts")
png("/Users/nityagupta/Desktop/GM project/code/svw_heatmap.png",width=16.5,height=22,units="in",res=1200)
# No scaling

dittoHeatmap(celltype_mean,
             assay = "logcounts", 
             cluster_cols = TRUE, genes = unique(unlist(all.markers)),
             scale = "none",
             annot.by =  c("cluster_name","labels"), 
             order.by = c("cluster_name","labels"), scaled.to.max = TRUE)

dev.off()
```

```{r, fig.width=20}
plotDeltaDistribution(slice1, ncol = 4, size = 1) 
ggsave("singleR_delta_distribution.png", dpi = 300, width = 9)
```


```{r}
all.markers <- metadata(slice1)$de.genes
slice$labels <- slice1$labels

# Beta cell-related markers
library(scater)
plotHeatmap(slice, columns = "labels", order_columns_by="labels", exprs_values = "logcounts",
    features=unique(unlist(all.markers)))
plotHeatmap(sce, features = topn2000_features, columns = "sample_id")
plotHeatmap(sce, features=rownames(sce)[1:10])


dittoHeatmap(slice, unique(unlist(all.markers)), annot.by = "labels", order.by = "labels")
```


```{r}
slice$pruned.labels <- slice1$pruned.labels
```



```{r, fig.width=15}
dittoDimPlot(slice, "subcluster_name", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
dittoDimPlot(slice, "labels", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
dittoDimPlot(slice, "pruned.labels", do.label = TRUE, labels.size = 3, legend.size = 6, legend.title = "Cell Type")
```


```{r, fig.width=15}
library(ggalluvial)

slice_df <- colData(slice)
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(labels, subcluster_name) %>% summarise(Frequency = n())
slice_df1$match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = Frequency)) +
  geom_alluvium(aes(fill = match))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Frequency") +
  theme(axis.text.y = element_text(size = 20))+
  theme_minimal()

```

```{r}
library(RColorBrewer)
n <- 60
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))
```



```{r, fig.width=15}
slice_df2 <- slice_df1

slice_df2$Frequency <- 10

ggplot(slice_df2,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = Frequency)) +
  geom_alluvium(aes(fill = match))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Frequency") +
  theme(axis.text.y = element_text(size = 50))+
  theme_minimal()
```

Alluvial plot for fibroblasts

```{r}
fibro <- slice[,slice$cluster_name == "Fibroblasts"]


slice_df <- as.data.frame(colData(fibro))
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(subcluster_name, labels) %>% summarise(Frequency = n())
slice_df1$Match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")
slice_df1 <- slice_df1[slice_df1$Frequency >= 10, ]
slice_df1$logfreq <- log10(slice_df1$Frequency)
slice_df1$Frequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$Frequency, 0-slice_df1$Frequency)
slice_df1$logFrequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$logfreq, 0-slice_df1$logfreq)


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
    geom_alluvium(aes(alpha = Frequency)) +
  geom_alluvium(aes(fill = Match)) + 
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]", alpha = "Number of Cells", fill = "Annotations match?") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))

#+scale_fill_gradient2(low="red", mid="white", high="blue", name = "log10[Cells]", limits = c(-500,8500))


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = logFrequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 1, low="red", mid="orange", high="yellow", name = "log10[Cells]")

```


Alluvial plot for endothelial


```{r}
endo <- slice[,slice$cluster_name == "Endothelial_cells"]


slice_df <- as.data.frame(colData(endo))
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(subcluster_name, labels) %>% summarise(Frequency = n())
slice_df1$Match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")
slice_df1 <- slice_df1[slice_df1$Frequency >= 10, ]
slice_df1$logfreq <- log10(slice_df1$Frequency)
slice_df1$Frequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$Frequency, 0-slice_df1$Frequency)
slice_df1$logFrequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$logfreq, 0-slice_df1$logfreq)

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
    geom_alluvium(aes(alpha = Frequency)) +
  geom_alluvium(aes(fill = Match)) + 
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]", alpha = "Number of Cells", fill = "Annotations match?") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = Frequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 0, low="red", mid="orange", high="yellow", name = "log10[Cells]")


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = logFrequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 1, low="red", mid="orange", high="yellow", name = "log10[Cells]")

```


```{r}
peri <- slice[,slice$cluster_name == "Pericytes"]


slice_df <- as.data.frame(colData(peri))
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(subcluster_name, labels) %>% summarise(Frequency = n())
slice_df1$Match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")
slice_df1 <- slice_df1[slice_df1$Frequency >= 10, ]
slice_df1$logfreq <- log10(slice_df1$Frequency)
slice_df1$Frequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$Frequency, 0-slice_df1$Frequency)
slice_df1$logFrequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$logfreq, 0-slice_df1$logfreq)

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
    geom_alluvium(aes(alpha = Frequency)) +
  geom_alluvium(aes(fill = Match)) + 
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]", alpha = "Number of Cells", fill = "Annotations match?") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = Frequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 0, low="red", mid="orange", high="yellow", name = "log10[Cells]")


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = logFrequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 1, low="red", mid="orange", high="yellow", name = "log10[Cells]")

```

```{r}
myeloid <- slice[,slice$cluster_name == "Myeloid_cells"]

slice_df <- as.data.frame(colData(myeloid))
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(subcluster_name, labels) %>% summarise(Frequency = n())
slice_df1$subcluster_name[slice_df1$subcluster_name == "CD14+_Monocytes"] <- "CD14+_monocytes"
slice_df1$Match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")
slice_df1 <- slice_df1[slice_df1$Frequency >= 10, ]
slice_df1$logfreq <- log10(slice_df1$Frequency)
slice_df1$Frequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$Frequency, 0-slice_df1$Frequency)
slice_df1$logFrequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$logfreq, 0-slice_df1$logfreq)




ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
    geom_alluvium(aes(alpha = Frequency)) +
  geom_alluvium(aes(fill = Match)) + 
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]", alpha = "Number of Cells", fill = "Annotations match?") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = Frequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 0, low="red", mid="orange", high="yellow", name = "log10[Cells]")


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = logFrequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 1, low="red", mid="orange", high="yellow", name = "log10[Cells]")

```




```{r}
lymphoid <- slice[,slice$cluster_name == "Lymphoid_cells"]

slice_df <- as.data.frame(colData(lymphoid))
slice_df <- slice_df[,c("labels", "subcluster_name")]
slice_df1 <- slice_df %>% group_by(subcluster_name, labels) %>% summarise(Frequency = n())
slice_df1$Match <- ifelse(slice_df1$subcluster_name == slice_df1$labels, "Yes", "No")
slice_df1 <- slice_df1[slice_df1$Frequency >= 10, ]
slice_df1$logfreq <- log10(slice_df1$Frequency)
slice_df1$Frequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$Frequency, 0-slice_df1$Frequency)
slice_df1$logFrequencymatch <- ifelse(slice_df1$Match == "Yes", slice_df1$logfreq, 0-slice_df1$logfreq)

ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
    geom_alluvium(aes(alpha = Frequency)) +
  geom_alluvium(aes(fill = Match)) + 
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]", alpha = "Number of Cells", fill = "Annotations match?") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))                


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = Frequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 0, low="red", mid="orange", high="yellow", name = "log10[Cells]")


ggplot(slice_df1,
       aes(axis1 = subcluster_name,
           axis2 = labels,
           y = logfreq)) +
  geom_alluvium(aes(fill = logFrequencymatch))+
  geom_stratum() +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), label.size = 0.01) +
  scale_x_discrete(limits = c("Manual Annotation", "SingleR Annotation"),
                   expand = c(.1, .1)) +
  labs(y = "Log10 [Cluster size]") +
  theme_minimal()+
  theme(text = element_text(size = 25), legend.text = element_text(size=10))+
  scale_fill_gradient2(midpoint = 1, low="red", mid="orange", high="yellow", name = "log10[Cells]")

```



### Differential Abundance


```{r}
whole_da <- whole[,whole$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
slice_da <- slice[,slice$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]

whole_da <- colData(whole_da)
slice_da <- colData(slice_da)

whole_da <- whole_da[whole_da$donor_id %in% c("PrLN4", "PrLN5", "PrLN6"), ]
slice_da <- slice_da[slice_da$sample_id %in% c("PrLN4_sCTRL", "PrLN5_sCTRL", "PrLN6_sCTRL"),]

whole_da$sample_id <- droplevels(whole_da$sample_id)
whole_da$sample_type <- droplevels(whole_da$sample_type)
whole_da$donor_id <- droplevels(whole_da$donor_id)

slice_da$sample_id <- droplevels(slice_da$sample_id)
slice_da$sample_type <- droplevels(slice_da$sample_type)

table(whole_da$donor_id)
table(slice_da$sample_id)
table(whole_da$sample_type)
table(whole_da$sample_id)
table(slice_da$sample_type)
```

```{r}
diff_abundance_w <- as.data.frame(table(whole_da$sample_id, whole_da$subcluster_name))
diff_abundance_s <- as.data.frame(table(slice_da$sample_id, slice_da$subcluster_name))
diff_abundance <- rbind(diff_abundance_w, diff_abundance_s)


colnames(diff_abundance) <- c("sample_id", "cell_type", "number_of_cells")
diff_abundance <- separate(data = diff_abundance, col = "sample_id", c('donor_id', 'sample_type'), sep = "_", remove = FALSE)
diff_abundance$donor_id <- as.factor(diff_abundance$donor_id)
diff_abundance$sample_id <- as.factor(diff_abundance$sample_id)
diff_abundance$sample_type <- as.factor(diff_abundance$sample_type)
diff_abundance
```

to calculate the percentage we divide by the total number of cells in the sample

```{r}
cells_per_sample <- rbind(data.frame(table(slice_da$sample_id)), data.frame(table(whole_da$sample_id)))
colnames(cells_per_sample) <- c("sample_id", "cells_per_sample")
cells_per_sample
matching_rows <- match(diff_abundance$sample_id, cells_per_sample$sample_id)
diff_abundance$cells_per_sample <- cells_per_sample$cells_per_sample[matching_rows]
diff_abundance
```

```{r}
diff_abundance$percentage_cells <- diff_abundance$number_of_cells/diff_abundance$cells_per_sample
diff_abundance
```

```{r}
diff_abundance <- diff_abundance %>% 
  mutate(category = recode(cell_type, "CD14+_Monocytes" = "CD14+_monocytes"))

diff_abundance$cell_type[diff_abundance$cell_type == "CD14+_Monocytes"] <- "CD14+_monocytes"
```


```{r fig.width=16}


ggplot(diff_abundance, aes(x = factor(cell_type, level = c("CD14+_monocytes", "CD16+_monocytes", "Neutrophils", "Mast_cells", "CD1C+_DCs", "CLEC9Ahigh_DCs", "XCR1+_CLEC9Ahigh_DCs", "XCR1+_DCs", "LAMP3+_DCs", "LAMP3+_CCL17high_DCs", "pDCs", "mito_high_myeloid", "PECAM1high_CAV1high_ESAMhigh_LEC", "PECAM1high_NT5E+_LEC", "PECAM1low_CCL20high_LEC", "PECAM1high_CAV1high_LEC", "PECAM1low_EFNB2high_LEC","LYVE1+_LEC", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts", "CXCL12high_HDAC11+_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "ITGA1high_ZEB2high_Pericytes","HDAC11+_cycling_Pericytes", "PEAR1+_Pericytes", "Myofibroblasts")), y = percentage_cells, fill = sample_type)) +
  geom_boxplot() + 
  labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Treatment") +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1)) +
  scale_fill_manual(values=c("#E78AC3", "#A1D99B"))

ggplot(diff_abundance, aes(x = factor(cell_type, level = c("CD14+_monocytes", "CD16+_monocytes", "Neutrophils", "Mast_cells", "CD1C+_DCs", "CLEC9Ahigh_DCs", "XCR1+_CLEC9Ahigh_DCs", "XCR1+_DCs", "LAMP3+_DCs", "LAMP3+_CCL17high_DCs", "pDCs", "mito_high_myeloid", "PECAM1high_CAV1high_ESAMhigh_LEC", "PECAM1high_NT5E+_LEC", "PECAM1low_CCL20high_LEC", "PECAM1high_CAV1high_LEC", "PECAM1low_EFNB2high_LEC","LYVE1+_LEC", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts", "CXCL12high_HDAC11+_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "ITGA1high_ZEB2high_Pericytes","HDAC11+_cycling_Pericytes", "PEAR1+_Pericytes", "Myofibroblasts")), y = percentage_cells, fill = sample_type)) +
  geom_boxplot() + 
  labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Sample Type", col = "Sample ID") +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1),panel.background = element_rect(fill = "white", colour ="darkgray", size = 2), panel.grid.major = element_line(colour = "gray", size = 0.3))+
  scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
  geom_point(aes(col = sample_id), size = 2)

  
```

```{r}
diff_abundance1 = diff_abundance[diff_abundance$cell_type %in% c("CD14+_monocytes", "Neutrophils", "Mast_cells", "LAMP3+_DCs", "pDCs", "CCL19high_Fibroblasts", "CXCL12high_Fibroblasts","PI16+_Fibroblasts", "PTPRGhigh_Fibroblasts", "Pericytes", "Myofibroblasts"),]

cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance1)
temp <- diff_abundance1 %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp

cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance1)
temp1 <- diff_abundance1 %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp1


pwc <- diff_abundance1 %>% 
  group_by(cell_type) %>%
  emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni") 
pwc
 
pwc1 <- subset(pwc, select = c(cell_type, statistic, p))
pwc1 <- pwc1[order(pwc1$p),]
colnames(pwc1) <- c("cell_type", "statistic", "p_value")
pwc1
```


```{r}
diff_abundance_w <- as.data.frame(table(whole_da$sample_id, whole_da$cluster_name))
diff_abundance_s <- as.data.frame(table(slice_da$sample_id, slice_da$cluster_name))
diff_abundance <- rbind(diff_abundance_w, diff_abundance_s)


colnames(diff_abundance) <- c("sample_id", "cell_type", "number_of_cells")
diff_abundance <- separate(data = diff_abundance, col = "sample_id", c('donor_id', 'sample_type'), sep = "_", remove = FALSE)
diff_abundance$donor_id <- as.factor(diff_abundance$donor_id)
diff_abundance$sample_id <- as.factor(diff_abundance$sample_id)
diff_abundance$sample_type <- as.factor(diff_abundance$sample_type)
diff_abundance
```

to calculate the percentage we divide by the total number of cells in the sample

```{r}
cells_per_sample <- rbind(data.frame(table(slice_da$sample_id)), data.frame(table(whole_da$sample_id)))
colnames(cells_per_sample) <- c("sample_id", "cells_per_sample")
cells_per_sample
matching_rows <- match(diff_abundance$sample_id, cells_per_sample$sample_id)
diff_abundance$cells_per_sample <- cells_per_sample$cells_per_sample[matching_rows]
diff_abundance
```

```{r}
diff_abundance$percentage_cells <- diff_abundance$number_of_cells/diff_abundance$cells_per_sample
diff_abundance

```

```{r}
cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance)
temp <- diff_abundance %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp

cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance)
temp1 <- diff_abundance %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp1


pwc <- diff_abundance %>% 
  group_by(cell_type) %>%
  emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni") 
pwc
 

```

```{r}


ggplot(diff_abundance, aes(x = cell_type, y = percentage_cells, fill = sample_type)) +
  geom_boxplot() + 
  labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Treatment", col = "Sample ID") +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1)) +
  scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
  geom_point(aes(col = sample_id), size = 2)

ggplot(diff_abundance, aes(x = cell_type, y = percentage_cells, fill = sample_type)) +
  geom_boxplot() + 
  labs(x = "Cell Type", y = "Proportion of cells (average across donors)", fill = "Sample Type", col = "Sample ID") +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 55, hjust = 1),panel.background = element_rect(fill = "white", colour ="darkgray", size = 2), panel.grid.major = element_line(colour = "gray", size = 0.3))+
  scale_fill_manual(values=c("#E78AC3", "#A1D99B"))+
  geom_point(aes(col = sample_id), size = 2)

#+geom_signif(stat = "identity", data = data.frame(x = c(0.12, 0.37), xend = c(0.17, ), y = c(0.4, 0.39), annotation = c("*", "**")), aes(x = x, xend = xend, y = y, yend = y, annotation = annotation))

  
```


```{r}
cd19.model <- lm(percentage_cells ~ cell_type + sample_type, data = diff_abundance)
temp <- diff_abundance %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp

cd19.model <- lm(percentage_cells ~ cell_type*sample_type, data = diff_abundance)
temp1 <- diff_abundance %>%
  group_by(cell_type) %>%
  anova_test(percentage_cells ~ sample_type, error = cd19.model)

temp1


pwc <- diff_abundance %>% 
  group_by(cell_type) %>%
  emmeans_test(percentage_cells ~ sample_type, p.adjust.method = "bonferroni") 
pwc
 
pwc1 <- subset(pwc, select = c(cell_type, statistic, p))
pwc1 <- pwc1[order(pwc1$p),]
colnames(pwc1) <- c("cell_type", "statistic", "p_value")
pwc1
```


```{r}

whole_da <- whole[,whole$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]
slice_da <- slice[,slice$cluster_name %in% c("Fibroblasts", "Endothelial_cells", "Myeloid_cells", "Pericytes")]

whole_da <- whole_da[whole_da$donor_id %in% c("PrLN4", "PrLN5", "PrLN6"), ]
slice_da <- slice_da[slice_da$sample_id %in% c("PrLN4_sCTRL", "PrLN5_sCTRL", "PrLN6_sCTRL"),]


assays(slice_da)$decontXcounts <- NULL

combined <- c(whole_da, slice_da)
```





